Topics:

  1. marginal effects – margins
  2. robust / resistance regression – robustbase
  3. sample selection models – sampleSelection
  4. spatial analysis tmap
install.packages("robustbase")
install.packages("sampleSelection")
install.packages("margins")
install.packages("tmap")
library(robustbase)
library(sampleSelection)
library(readxl)
library(tidyverse)
library(olsrr)
library(sandwich)
library(lmtest)
library(margins)
library(tmap)
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
library(sf)
Linking to GEOS 3.9.1, GDAL 3.3.0, PROJ 7.2.0; sf_use_s2() is TRUE
pzn_rent <- read_excel("data/rent-poznan.xlsx")

set.seed(123)
pzn_rent_subset <- pzn_rent %>%
  add_count(quarter, name = "quarter_count") %>%
  filter(quarter_count >= 50) %>%
  filter(price >= 500, price <= 15000, flat_area >= 15, flat_area <= 250) %>%
  sample_n(3000)
  
pzn_rent_subset
model_pzn <- lm(formula = price ~ flat_area + flat_rooms + individual + flat_furnished + 
                  flat_for_students +  flat_balcony,
                data = pzn_rent_subset)
summary(model_pzn)

Call:
lm(formula = price ~ flat_area + flat_rooms + individual + flat_furnished + 
    flat_for_students + flat_balcony, data = pzn_rent_subset)

Residuals:
    Min      1Q  Median      3Q     Max 
-5043.1  -276.5   -53.7   223.2  9772.2 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)            553.5873    30.9729  17.873  < 2e-16 ***
flat_area               20.7297     0.8218  25.226  < 2e-16 ***
flat_rooms              85.4827    19.8450   4.308 1.70e-05 ***
individualTRUE          24.8742    25.6332   0.970  0.33193    
flat_furnishedTRUE     138.6813    25.5310   5.432 6.02e-08 ***
flat_for_studentsTRUE -169.2845    25.6333  -6.604 4.71e-11 ***
flat_balconyTRUE        64.1760    20.6216   3.112  0.00188 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 544.6 on 2993 degrees of freedom
Multiple R-squared:  0.4227,    Adjusted R-squared:  0.4215 
F-statistic: 365.2 on 6 and 2993 DF,  p-value: < 2.2e-16
plot(model_pzn)

summary(model_pzn_rob)

Call:
robustbase::lmrob(formula = price ~ flat_area + flat_rooms + individual + 
    flat_furnished + flat_for_students + flat_balcony, data = pzn_rent_subset, 
    method = "MM")
 \--> method = "MM"
Residuals:
     Min       1Q   Median       3Q      Max 
-3705.20  -217.14   -13.43   255.22  9774.74 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)            752.428     27.813  27.053  < 2e-16 ***
flat_area               12.665      1.162  10.902  < 2e-16 ***
flat_rooms             139.919     19.646   7.122 1.33e-12 ***
individualTRUE          41.007     16.476   2.489   0.0129 *  
flat_furnishedTRUE      84.644     17.560   4.820 1.51e-06 ***
flat_for_studentsTRUE  -85.358     15.842  -5.388 7.67e-08 ***
flat_balconyTRUE        74.079     14.368   5.156 2.69e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Robust residual standard error: 344.6 
Multiple R-squared:  0.4267,    Adjusted R-squared:  0.4255 
Convergence in 20 IRWLS iterations

Robustness weights: 
 53 observations c(176,198,199,208,260,284,411,540,601,629,664,684,698,723,743,761,786,851,909,943,1242,1277,1328,1350,1358,1379,1449,1460,1609,1622,1841,1889,1898,1900,2062,2080,2120,2258,2297,2301,2400,2424,2445,2463,2496,2557,2625,2663,2749,2816,2883,2909,2975)
     are outliers with |weight| = 0 ( < 3.3e-05); 
 240 weights are ~= 1. The remaining 2707 ones are summarized as
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0000354 0.8664000 0.9523000 0.8848000 0.9856000 0.9990000 
Algorithmic parameters: 
       tuning.chi                bb        tuning.psi        refine.tol           rel.tol 
        1.548e+00         5.000e-01         4.685e+00         1.000e-07         1.000e-07 
        scale.tol         solve.tol       eps.outlier             eps.x warn.limit.reject 
        1.000e-10         1.000e-07         3.333e-05         4.547e-10         5.000e-01 
warn.limit.meanrw 
        5.000e-01 
     nResample         max.it       best.r.s       k.fast.s          k.max    maxit.scale      trace.lev 
           500             50              2              1            200            200              0 
           mts     compute.rd fast.s.large.n 
          1000              0           2000 
                  psi           subsampling                   cov compute.outlier.stats 
           "bisquare"         "nonsingular"         ".vcov.avar1"                  "SM" 
seed : int(0) 
plot(model_pzn_rob$model$price, model_pzn_rob$rweights)

Sample selection models:

basic selection model: Heckman’s model more advanced econometric models: copula selection models more statistical approach: not missing at random models

Generate sample data

set.seed(123)
n <- 1000 
x1 <- rnorm(n = n, mean = 0, sd = 1)
x2 <- rnorm(n = n, mean = 0, sd = 1)
errors <- MASS::mvrnorm(n = n, mu = c(0, 0), 
                        Sigma = matrix(c(1, 0.5, 0.5, 2), nrow=2),
                        empirical = T)

selmodel <- data.frame(r = 1 + x1 + x2 + errors[, 1],
                       y = 1 + x1 + errors[, 2])

selmodel$sel <- selmodel$r > 0

selection(selection = sel ~ x1 + x2,outcome = y ~ x1, data = selmodel) |> 
  summary()

Spatial

df_salaries <- read_excel("data/data-salaries.xlsx", sheet = 2) %>%
  dplyr::select(id = Kod, name = Nazwa, salaries = Wartosc)

df_real <- read_excel("data/data-real-estate.xlsx", sheet = 2) %>%
  dplyr::select(id = Kod, name = Nazwa, real = Wartosc)

df_model <- df_salaries %>%
  inner_join(df_real) %>%
  filter(real > 0)
Joining, by = c("id", "name")
plot((df_model$salaries), (df_model$real))


model1 <- lm(formula = log(real) ~ log(salaries), data = df_model)
df_model$resids <- resid(model1)
plot(model1)

powiats %>% 
  dplyr::select(id=jpt_kod_je) %>%
  left_join(df_model %>%
              mutate(id = substr(id, 1,4))) -> for_plot
Joining, by = "id"
tm_shape(for_plot) +
  tm_polygons(col = "resids", style = "jenks") +
  tm_shape(woj) + 
  tm_borders(lwd = 2)
Warning: The projection of the shape object for_plot is not known, while it seems to be projected.
Warning: Current projection of shape for_plot unknown and cannot be determined.
Warning: Current projection of shape woj unknown and cannot be determined.
Variable(s) "resids" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

Generating correlated data

library(MASS)
m <- 2
sigma <- diag(c(1,2), 2, 2)
sigma[1,2] <- sigma[2,1] <- 0.5
fake_data <- MASS::mvrnorm(n = 2000, mu=rep(0, m), Sigma = sigma, empirical = T)
cor(fake_data)
plot(fake_data)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVG9waWNzOgoKMS4gbWFyZ2luYWwgZWZmZWN0cyAtLSBtYXJnaW5zCjIuIHJvYnVzdCAvIHJlc2lzdGFuY2UgcmVncmVzc2lvbiAtLSByb2J1c3RiYXNlCjMuIHNhbXBsZSBzZWxlY3Rpb24gbW9kZWxzIC0tIHNhbXBsZVNlbGVjdGlvbgo0LiBzcGF0aWFsIGFuYWx5c2lzIHRtYXAKCgpgYGB7cn0KaW5zdGFsbC5wYWNrYWdlcygicm9idXN0YmFzZSIpCmluc3RhbGwucGFja2FnZXMoInNhbXBsZVNlbGVjdGlvbiIpCmluc3RhbGwucGFja2FnZXMoIm1hcmdpbnMiKQppbnN0YWxsLnBhY2thZ2VzKCJ0bWFwIikKYGBgCgoKYGBge3J9CmxpYnJhcnkocm9idXN0YmFzZSkKbGlicmFyeShzYW1wbGVTZWxlY3Rpb24pCmxpYnJhcnkocmVhZHhsKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShvbHNycikKbGlicmFyeShzYW5kd2ljaCkKbGlicmFyeShsbXRlc3QpCmxpYnJhcnkobWFyZ2lucykKbGlicmFyeSh0bWFwKQpsaWJyYXJ5KHNmKQpgYGAKCgpgYGB7cn0KcHpuX3JlbnQgPC0gcmVhZF9leGNlbCgiZGF0YS9yZW50LXBvem5hbi54bHN4IikKCnNldC5zZWVkKDEyMykKcHpuX3JlbnRfc3Vic2V0IDwtIHB6bl9yZW50ICU+JQogIGFkZF9jb3VudChxdWFydGVyLCBuYW1lID0gInF1YXJ0ZXJfY291bnQiKSAlPiUKICBmaWx0ZXIocXVhcnRlcl9jb3VudCA+PSA1MCkgJT4lCiAgZmlsdGVyKHByaWNlID49IDUwMCwgcHJpY2UgPD0gMTUwMDAsIGZsYXRfYXJlYSA+PSAxNSwgZmxhdF9hcmVhIDw9IDI1MCkgJT4lCiAgc2FtcGxlX24oMzAwMCkKICAKcHpuX3JlbnRfc3Vic2V0CmBgYApgYGB7cn0KbW9kZWxfcHpuIDwtIGxtKGZvcm11bGEgPSBwcmljZSB+IGZsYXRfYXJlYSArIGZsYXRfcm9vbXMgKyBpbmRpdmlkdWFsICsgZmxhdF9mdXJuaXNoZWQgKyAKICAgICAgICAgICAgICAgICAgZmxhdF9mb3Jfc3R1ZGVudHMgKyAgZmxhdF9iYWxjb255LAogICAgICAgICAgICAgICAgZGF0YSA9IHB6bl9yZW50X3N1YnNldCkKc3VtbWFyeShtb2RlbF9wem4pCnBsb3QobW9kZWxfcHpuKQpgYGAKCmBgYHtyfQpvbHNfcGxvdF9jb29rc2RfY2hhcnQobW9kZWxfcHpuKQpgYGAKCmBgYHtyfQpvbHNfcGxvdF9kZmJldGFzKG1vZGVsX3B6bikKYGBgCgpgYGB7cn0KbW9kZWxfcHpuX3JvYiA8LSByb2J1c3RiYXNlOjpsbXJvYihmb3JtdWxhID0gcHJpY2UgfiBmbGF0X2FyZWEgKyBmbGF0X3Jvb21zICsgaW5kaXZpZHVhbCArIGZsYXRfZnVybmlzaGVkICsgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmbGF0X2Zvcl9zdHVkZW50cyArICBmbGF0X2JhbGNvbnksCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkYXRhID0gcHpuX3JlbnRfc3Vic2V0LAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbWV0aG9kID0gIk1NIikKc3VtbWFyeShtb2RlbF9wem5fcm9iKQpgYGAKCmBgYHtyfQpwbG90KG1vZGVsX3B6bl9yb2IkbW9kZWwkcHJpY2UsIG1vZGVsX3B6bl9yb2IkcndlaWdodHMpCmBgYAoKCgpgYGB7cn0KZGF0YS5mcmFtZShub25yb2J1c3QgPSBjb2VmKG1vZGVsX3B6biksIHJvYnVzdCA9IGNvZWYobW9kZWxfcHpuX3JvYikpCmBgYAoKClNhbXBsZSBzZWxlY3Rpb24gbW9kZWxzOgoKYmFzaWMgc2VsZWN0aW9uIG1vZGVsOiBIZWNrbWFuJ3MgbW9kZWwKbW9yZSBhZHZhbmNlZCBlY29ub21ldHJpYyBtb2RlbHM6IGNvcHVsYSBzZWxlY3Rpb24gbW9kZWxzCm1vcmUgc3RhdGlzdGljYWwgYXBwcm9hY2g6IG5vdCBtaXNzaW5nIGF0IHJhbmRvbSBtb2RlbHMKCgoKCkdlbmVyYXRlIHNhbXBsZSBkYXRhCgoKYGBge3J9CnNldC5zZWVkKDEyMykKbiA8LSAxMDAwIAp4MSA8LSBybm9ybShuID0gbiwgbWVhbiA9IDAsIHNkID0gMSkKeDIgPC0gcm5vcm0obiA9IG4sIG1lYW4gPSAwLCBzZCA9IDEpCmVycm9ycyA8LSBNQVNTOjptdnJub3JtKG4gPSBuLCBtdSA9IGMoMCwgMCksIAogICAgICAgICAgICAgICAgICAgICAgICBTaWdtYSA9IG1hdHJpeChjKDEsIDAuNSwgMC41LCAyKSwgbnJvdz0yKSwKICAgICAgICAgICAgICAgICAgICAgICAgZW1waXJpY2FsID0gVCkKCnNlbG1vZGVsIDwtIGRhdGEuZnJhbWUociA9IDEgKyB4MSArIHgyICsgZXJyb3JzWywgMV0sCiAgICAgICAgICAgICAgICAgICAgICAgeSA9IDEgKyB4MSArIGVycm9yc1ssIDJdKQoKc2VsbW9kZWwkc2VsIDwtIHNlbG1vZGVsJHIgPiAwCgpzZWxlY3Rpb24oc2VsZWN0aW9uID0gc2VsIH4geDEgKyB4MixvdXRjb21lID0geSB+IHgxLCBkYXRhID0gc2VsbW9kZWwpIHw+IAogIHN1bW1hcnkoKQpgYGAKCgpTcGF0aWFsCgpgYGB7cn0KZGZfc2FsYXJpZXMgPC0gcmVhZF9leGNlbCgiZGF0YS9kYXRhLXNhbGFyaWVzLnhsc3giLCBzaGVldCA9IDIpICU+JQogIGRwbHlyOjpzZWxlY3QoaWQgPSBLb2QsIG5hbWUgPSBOYXp3YSwgc2FsYXJpZXMgPSBXYXJ0b3NjKQoKZGZfcmVhbCA8LSByZWFkX2V4Y2VsKCJkYXRhL2RhdGEtcmVhbC1lc3RhdGUueGxzeCIsIHNoZWV0ID0gMikgJT4lCiAgZHBseXI6OnNlbGVjdChpZCA9IEtvZCwgbmFtZSA9IE5hendhLCByZWFsID0gV2FydG9zYykKCmRmX21vZGVsIDwtIGRmX3NhbGFyaWVzICU+JQogIGlubmVyX2pvaW4oZGZfcmVhbCkgJT4lCiAgZmlsdGVyKHJlYWwgPiAwKQoKCnBsb3QoKGRmX21vZGVsJHNhbGFyaWVzKSwgKGRmX21vZGVsJHJlYWwpKQoKbW9kZWwxIDwtIGxtKGZvcm11bGEgPSBsb2cocmVhbCkgfiBsb2coc2FsYXJpZXMpLCBkYXRhID0gZGZfbW9kZWwpCmRmX21vZGVsJHJlc2lkcyA8LSByZXNpZChtb2RlbDEpCnBsb3QobW9kZWwxKQpgYGAKCmBgYHtyfQpwb3dpYXRzIDwtIHN0X3JlYWQoZHNuID0gImRhdGEvbWFweS9wb3dpYXR5LnNocCIpCndvaiA8LSBzdF9yZWFkKGRzbiA9ICJkYXRhL21hcHkvd29qLmRiZiIpCnBsb3QocG93aWF0cyRnZW9tZXRyeSkKcGxvdCh3b2okZ2VvbWV0cnksIGFkZCA9IFQsIGx3ZCA9IDIpCmBgYAoKYGBge3J9CnBvd2lhdHMgJT4lIAogIGRwbHlyOjpzZWxlY3QoaWQ9anB0X2tvZF9qZSkgJT4lCiAgbGVmdF9qb2luKGRmX21vZGVsICU+JQogICAgICAgICAgICAgIG11dGF0ZShpZCA9IHN1YnN0cihpZCwgMSw0KSkpIC0+IGZvcl9wbG90CmBgYAoKYGBge3J9CnRtX3NoYXBlKGZvcl9wbG90KSArCiAgdG1fcG9seWdvbnMoY29sID0gInJlc2lkcyIsIHN0eWxlID0gImplbmtzIikgKwogIHRtX3NoYXBlKHdvaikgKyAKICB0bV9ib3JkZXJzKGx3ZCA9IDIpCmBgYAoKR2VuZXJhdGluZyBjb3JyZWxhdGVkIGRhdGEKCmBgYHtyfQpsaWJyYXJ5KE1BU1MpCm0gPC0gMgpzaWdtYSA8LSBkaWFnKGMoMSwyKSwgMiwgMikKc2lnbWFbMSwyXSA8LSBzaWdtYVsyLDFdIDwtIDAuNQpmYWtlX2RhdGEgPC0gTUFTUzo6bXZybm9ybShuID0gMjAwMCwgbXU9cmVwKDAsIG0pLCBTaWdtYSA9IHNpZ21hLCBlbXBpcmljYWwgPSBUKQpjb3IoZmFrZV9kYXRhKQpwbG90KGZha2VfZGF0YSkKCmBgYAoKCg==